In [1]:
# Show the notebook kernel's current working directory.
%pwd
Out[1]:
Load file into a DataFrame
In [2]:
import pandas as pd
# Read the 2010 baby-name records. The column labels are supplied here, so
# no row of the file is interpreted as a header.
names2010 = pd.read_csv(
    '/resources/yob2010.txt',
    header=None,
    names=['name', 'sex', 'births'],
)
names2010
Out[2]:
Total number of births in 2010 by sex
In [3]:
# Sum the births column separately for each sex.
names2010.groupby('sex')['births'].sum()
Out[3]:
Add a `prop` column giving each name's share of the births within its sex group
In [4]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the total ``births`` of the
    whole group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index plus the ``prop`` column.
        The frame passed in is left unmodified.
    """
    # Cast to float so the ratio is a true division even where int / int
    # would floor.
    births = group['births'].astype(float)
    # assign() builds a new frame instead of writing into the object that
    # groupby.apply handed in, avoiding SettingWithCopy warnings and
    # mutation side effects inside apply.
    return group.assign(prop=births / births.sum())
# Share of births within each sex, computed with a grouped transform.
# transform('sum') returns the group total aligned to every original row, so
# the frame keeps its flat index and its 'sex' column. groupby(...).apply(add_prop)
# does not guarantee that on current pandas: the group key can be added to the
# index, which makes a later groupby('sex') ambiguous.
names2010 = names2010.assign(
    prop=names2010['births'] / names2010.groupby('sex')['births'].transform('sum')
)
In [5]:
# Display names2010 after the prop column has been added.
names2010
Out[5]:
Verify that the `prop` column sums to 1 within each group
In [7]:
import numpy as np
# Per-sex totals of prop should all equal 1 (up to floating-point tolerance).
np.allclose(names2010.groupby(['sex'])['prop'].sum(), 1)
Out[7]:
Extract a subset of the data with the top 10 names for each sex
In [8]:
def get_top10(group):
    """Return the 10 rows of ``group`` with the highest ``births``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a ``births`` column.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows, ordered by ``births`` descending. If ``group`` has
        fewer than 10 rows, all of them are returned.
    """
    # sort_index(by=...) has been removed from pandas; sort_values is the
    # supported way to order rows by a column's values.
    return group.sort_values(by='births', ascending=False)[:10]
# Group the rows by sex.
grouped = names2010.groupby(['sex'])
# Run get_top10 on each group and combine the per-group results.
top10 = grouped.apply(get_top10)
In [9]:
# Replace the index left by the grouped apply with consecutive integers
# 0 .. len(top10) - 1.
top10.index = np.arange(len(top10))
In [10]:
# Display the re-indexed top-10 table.
top10
Out[10]:
Aggregate all births by the first letter of the `name` column
In [11]:
def get_first_letter(x):
    """Return the first character of ``x``."""
    return x[0]

# First letter of every name. The Series is named 'first_letter' so that the
# pivot table's index carries that label.
first_letters = names2010['name'].map(get_first_letter).rename('first_letter')
# Total births per (first letter, sex). aggfunc is given as the string 'sum':
# passing the builtin ``sum`` is deprecated in recent pandas.
table = names2010.pivot_table('births', index=first_letters,
                              columns=['sex'], aggfunc='sum')
In [12]:
# Preview the first five rows of the pivot table.
table.head(n=5)
Out[12]:
Normalize the table
In [13]:
# Column totals: one sum per sex column of the pivot table.
table.sum(axis=0)
Out[13]:
In [14]:
# Divide every column by its own total, turning counts into per-sex proportions.
letter_prop = table.div(table.sum().astype(float), axis='columns')
Plot the proportion of boys' and girls' births whose names start with each letter
In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
# Two stacked subplots: the 'M' column on top, the 'F' column below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
# One bar per first letter; rot=0 keeps the tick labels horizontal.
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
legend=False)
Out[16]: